import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import plotly.graph_objects as go
from urllib.request import urlopen
import json
First, obtain the GeoJSON data for the choropleth map.
# Download the county GeoJSON (from the plotly datasets repository) that the
# choropleth maps below pass as their `geojson` argument.
with urlopen(
    'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
) as response:
    counties = json.loads(response.read())
Because the map uses FIPS codes to identify counties, the datasets with Georgia counties also need to include the FIPS codes. The FIPS codes were obtained and appended to the dataset in 'get-fips.ipynb'.
# Per-county figures. FIPS is read as text rather than as a number
# (presumably so the codes match the string ids in the GeoJSON -- verify).
county_df = pd.read_csv(
    'countyFIPS0423.csv',
    dtype={'FIPS': str},
)
county_df.head()  # notebook-style preview of the first rows
The choropleth map shows which areas have the highest number of cases relative to each other. Hover for exact numbers.
# Choropleth of confirmed cases per county. The color scale runs from zero
# to the largest county count so the shading is relative to the worst-hit
# county in the data.
fig1 = px.choropleth(
    county_df,
    geojson=counties,
    locations='FIPS',
    color='Cases',
    color_continuous_scale='Redor',
    range_color=(0, county_df['Cases'].max()),
    scope='usa',
    hover_name='Name',
    hover_data=['Cases'],
    labels={'Cases': 'Cases'},
)
# Zoom to the counties actually present and hide the base map.
fig1.update_geos(fitbounds='locations', visible=False)
# Drop all margins so the map fills the figure.
fig1.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig1.show()
# Testing totals. Using the first column as the index was considered for
# easier data extraction but is left disabled:
#   tests_df.set_index(tests_df.columns[0], inplace=True)
tests_df = pd.read_csv("testing0423.csv")
tests_df  # notebook-style display of the whole table
Show tests administered by commercial and non-commercial labs as a proportion of the total. As we can see, commercial labs have administered significantly more tests than the GPHL.
# Pie chart of the total tests administered by each entity.
#
# The slice labels are supplied positionally rather than read from the data:
# they assume tests_df holds exactly two rows, commercial labs first and the
# public health laboratory second -- TODO confirm against the CSV
# (names='Entity' would take the labels from the data instead).
# Fix: the second label previously read "Heath" instead of "Health".
_lab_labels = ['Commercial Laboratories', 'Georgia Public Health Laboratory']
pie_total_tests = px.pie(
    tests_df,
    title="Tests Administered by Entity",
    values='Total Tests',
    names=_lab_labels,
    hover_name=_lab_labels,
    hover_data=['Total Tests'],
)
pie_total_tests.show()
Show the proportion of tests that were positive and negative.
# Stacked bar chart per entity: negative tests (total minus positive) with
# positive tests stacked on top, so each bar's full height is the total.
bar_testing = go.Figure(data=[
    go.Bar(
        name='Negative Tests',
        x=tests_df['Entity'],
        y=tests_df['Total Tests'] - tests_df['Positive Tests'],
        marker_color='mediumseagreen',
    ),
    go.Bar(
        name='Positive Tests',
        x=tests_df['Entity'],
        y=tests_df['Positive Tests'],
        marker_color='firebrick',
    ),
])
# Stack the two series instead of drawing them side by side.
bar_testing.update_layout(barmode='stack')
# Display explicitly, like every other figure in this file; previously the
# chart only appeared when this was the last expression of a notebook cell.
bar_testing.show()
Georgia began releasing information about the races of people affected by COVID-19. While most are unknown or unrecorded, we can get insights about people of known races.
# Case counts broken down by race (and ethnicity, used further below).
race_df = pd.read_csv("race0423.csv")
race_df.head()  # notebook-style preview of the first rows
Show the number of cases per race as a proportion of the total. Note that you can click races on the key to add/remove them. Of those whose races are known, we can see that Black or African-American people have been most affected.
# Pie chart of cases per race; one slice per value of the 'Race' column.
pie_total_race = px.pie(
    race_df,
    title="Total Cases By Race",
    values='Cases',
    names='Race',
)
# Put the percentage and the race name outside each slice.
pie_total_race.update_traces(
    textposition='outside',
    textinfo='percent+label',
)
pie_total_race.show()
The dataset also includes information about ethnicity (whether Hispanic, non-Hispanic, or unknown). This stacked bar graph will show the total cases by both race and ethnicity. People of unknown race have been excluded.
# Keep only rows whose race is recorded, then stack cases by ethnicity
# within each race.
known_race = race_df.loc[race_df['Race'].ne("Unknown")]
fig = px.bar(
    known_race,
    title="Total Cases by Race and Ethnicity (When Known)",
    x="Race",
    y="Cases",
    color="Ethnicity",
)
fig.show()
This histogram shows the distribution of the ages at death from COVID-19 by gender. A box-and-whisker plot is added in the margin for further information. As we can see, males have been dying younger than females.
# Age-at-death histogram, one color per gender, with a box-and-whisker
# plot drawn in the margin above it.
deaths_df = pd.read_csv('deaths0423.csv')
fig = px.histogram(
    deaths_df,
    title="Georgia COVID-19 Deaths by Age",
    x="Age",
    color="Gender",
    marginal="box",
    # NOTE(review): the FEMALE color literal carries a leading space; it is
    # kept exactly as written here.
    color_discrete_map={
        'MALE': 'navy',
        'FEMALE': ' MediumVioletRed',
        'UNKNOWN': 'black',
    },
)
fig.show()
# TODO: Add statistical tests to see if these differences are statistically significant
The vast majority of people who have died had known underlying conditions.
# Raw tally of the 'Underlying' column (shown when run as a notebook cell).
deaths_df['Underlying'].value_counts()
# Pie chart of that tally. The slice labels are hard-coded and matched to the
# counts purely by position, so they rely on value_counts() returning the
# categories in this order (present, unknown, not present).
# NOTE(review): re-check the order whenever the data changes.
underlying = px.pie(
    deaths_df,
    title="Presence of Underlying Conditions Among the Deceased",
    values=deaths_df['Underlying'].value_counts(),
    names=[
        'Underlying condition present',
        'Unknown condition',
        'Underlying condition not present',
    ],
)
# Put the percentage and the label outside each slice.
underlying.update_traces(
    textposition='outside',
    textinfo='percent+label',
)
underlying.show()
Verify that the labels were assigned correctly.
# Re-display the raw counts of the 'Underlying' column so the hard-coded pie
# labels can be compared against the actual category order.
deaths_df['Underlying'].value_counts()
Which counties have had the most deaths as a proportion of the total number of cases? I previously added a "Percent Deaths" column to the county data set. Here I will make another choropleth map to show this data. Hover data also includes the total number of cases and deaths for better perspective.
# Choropleth of deaths as a share of cases per county. The color scale runs
# from zero to the highest county value; hovering shows cases, deaths and
# the percentage together.
deaths_map = px.choropleth(
    county_df,
    geojson=counties,
    locations='FIPS',
    color='Percent Deaths',
    color_continuous_scale='YlOrRd',
    range_color=(0, county_df['Percent Deaths'].max()),
    scope='usa',
    hover_name='Name',
    hover_data=['Cases', 'Deaths', 'Percent Deaths'],
)
# Zoom to the counties actually present and hide the base map.
deaths_map.update_geos(fitbounds='locations', visible=False)
# Drop all margins so the map fills the figure.
deaths_map.update_layout(margin=dict(r=0, t=0, l=0, b=0))
deaths_map.show()
import glob
Function to merge datasets when necessary
# Tag each matching CSV with the date taken from its file name, then stack
# the files into one DataFrame so changes can be tracked over time.
# Input MUST be a string in the format 'dataset*.csv' (example: 'summary*.csv')
# OR a valid date can replace the * if only one file is needed.
def track_dates(file_names: str) -> pd.DataFrame:
    """Read every CSV matching *file_names* and stack them with a Date column.

    Args:
        file_names: A glob pattern such as ``'summary*.csv'``, or a single
            file name with the date written out in place of the ``*``.

    Returns:
        One DataFrame holding the rows of every matched file, each row
        carrying a ``Date`` column with the four characters that precede the
        file's four-character extension (``'0423'`` for ``'summary0423.csv'``).
        Files are stacked in sorted path order and keep their original row
        indexes. An empty DataFrame is returned when nothing matches.
    """
    # glob returns matches in arbitrary order; sort so the dates are stacked
    # chronologically when the names differ only by date.
    frames = [
        pd.read_csv(path).assign(Date=path[-8:-4])
        for path in sorted(glob.glob(file_names))
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the historical behavior of
        # handing back an empty frame instead.
        return pd.DataFrame()
    # A single concat replaces repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and re-copied the accumulated data on every pass.
    return pd.concat(frames)
# Stack every daily summary file into one frame carrying a Date column.
tracked_cases = track_dates(file_names='summary*.csv')
# Notebook-style display to verify that the files merged with their dates.
tracked_cases
def _hover_labels(dates):
    """Turn 'MMDD' date strings into hover labels such as 'April 23'.

    Values that do not parse as month+day are passed through unchanged, so
    an unexpected file-name pattern degrades to raw labels instead of failing.
    """
    parsed = pd.to_datetime(dates, format='%m%d', errors='coerce')
    pretty = parsed.dt.strftime('%B %d')
    return [
        label if pd.notna(label) else raw
        for label, raw in zip(pretty, dates)
    ]


# Split the merged summaries into one frame per status.
tracked_total = tracked_cases.loc[tracked_cases["Status"] == "Total"]
tracked_deaths = tracked_cases.loc[tracked_cases["Status"] == "Deaths"]
tracked_hospitalized = tracked_cases.loc[tracked_cases["Status"] == "Hospitalized"]

# One line per status over time. Hover text is derived from each frame's own
# Date column rather than a fixed six-date list, so it stays correct when
# summary files are added, removed or read in a different order.
fig = go.Figure()
for _status_df, _trace_name in (
    (tracked_total, 'Total Cases'),
    (tracked_hospitalized, 'Total Hospitalized'),
    (tracked_deaths, 'Total Deaths'),
):
    fig.add_trace(go.Scatter(
        x=_status_df['Date'],
        y=_status_df['Confirmed Cases'],
        mode='lines+markers',
        name=_trace_name,
        text=_hover_labels(_status_df['Date']),
    ))
fig.show()